{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5939b34f",
   "metadata": {},
   "source": [
    "# 0. 原指导书中给出的示例代码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "37de5adb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\lib\\site-packages\\elasticsearch\\connection\\base.py:209: ElasticsearchWarning: Elasticsearch built-in security features are not enabled. Without authentication, your cluster could be accessible to anyone. See https://www.elastic.co/guide/en/elasticsearch/reference/7.15/security-minimal-setup.html to enable security.\n",
      "  warnings.warn(message, category=ElasticsearchWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'author': 'Information Retrieval', 'text': 'Test for ElasticSearch', 'timestamp': '2021-11-12T18:00:07.867153'}\n",
      "Got 1 Hits:\n",
      "2021-11-12T18:00:07.867153 Information Retrieval: Test for ElasticSearch\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-22-1a562ced9937>:22: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
      "  res = es.search(index=\"test-index\", body=q1)\n"
     ]
    }
   ],
   "source": [
    "from elasticsearch.client import Elasticsearch\n",
    "from datetime import datetime\n",
    "es = Elasticsearch([{'host':'127.0.0.1','port':9200}], timeout=3600)\n",
    "doc = {\n",
    "    'author': 'Information Retrieval',\n",
    "    'text': 'Test for ElasticSearch',\n",
    "    'timestamp': datetime.now(),\n",
    "}\n",
    "if __name__ == \"__main__\":\n",
    "    res = es.index(index=\"test-index\", id=1, document=doc)\n",
    "    #print(res['result']) 输出update\n",
    "    res = es.get(index=\"test-index\", id=1)\n",
    "    print(res['_source'])  #输出原始内容\n",
    "    es.indices.refresh(index=\"test-index\")\n",
    "    q1 = {\n",
    "        \"query\": {\n",
    "            \"match\":{\n",
    "                \"author\" : \"Information Retrieval\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "    res = es.search(index=\"test-index\", body=q1)\n",
    "    print(\"Got %d Hits:\" % res['hits']['total']['value'])\n",
    "    for hit in res['hits']['hits']:\n",
    "        print(\"%(timestamp)s %(author)s: %(text)s\" % hit[\"_source\"])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "becd855c",
   "metadata": {},
   "source": [
    "# 1. 对少量邮件进行格式转化、索引构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "18ec6c61",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-66-acaa55ac94e3>:23: DeprecationWarning: Using positional arguments for APIs is deprecated and will be disabled in 8.0.0. Instead use only keyword arguments for all APIs. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
      "  es.indices.delete('index1')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 1.89712, 'hits': [{'_index': 'index1', '_type': '_doc', '_id': '2', '_score': 1.89712, '_source': {'Message-ID': '<19129466.1075855378242.JavaMail.evans@thyme>', 'Date': 'Thu, 10 May 2001 12:50:00 -0700 (PDT)', 'From': 'phillip.allen@enron.com', 'To': 'jsmith@austintx.com', 'Subject': '', 'Mime-Version': '1.0', 'Content-Type': 'text/plain; charset=us-ascii', 'Content-Transfer-Encoding': '7bit', 'X-From': 'Phillip K Allen', 'X-To': 'jsmith <jsmith@austintx.com>', 'X-cc': '', 'X-bcc': '', 'X-Folder': \"\\\\Phillip_Allen_Jan2002_1\\\\Allen, Phillip K.\\\\'Sent Mail\", 'X-Origin': 'Allen-P', 'X-FileName': 'pallen (Non-Privileged).pst'}}]}}\n",
      "\n",
      "\n",
      "From: phillip.allen@enron.com  To:jsmith@austintx.com , id2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-66-acaa55ac94e3>:40: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
      "  res1 = es.search(index=\"index1\", body=query,params=params)\n"
     ]
    }
   ],
   "source": [
    "#一些简单测试\n",
    "from email.parser import Parser\n",
    "import os\n",
    "\n",
    "def em2js(path,e_id): \n",
    "    \"\"\"\n",
    "    这一函数可以实现从email格式文件到json格式的转变和索引构建、文件插入\n",
    "    返回json格式的信息jres\n",
    "    \n",
    "    path：需要构建索引的文件所在路径\n",
    "    e_id：为邮件在索引中分配的id\n",
    "    \n",
    "    keyname：邮件的关键字列表，包括信息原始ID、发信时间、收发人等内容\n",
    "    \"\"\"\n",
    "    keyname=['Message-ID','Date','From','To','Subject','Mime-Version','X-From','X-To','X-Folder','X-Origin','X-FileName']\n",
    "    if os.path.exists(path):\n",
    "        with open(path) as fp:\n",
    "            em=fp.read()\n",
    "            header=Parser().parsestr(em)# 将符合格式的email文件解析成email字典\n",
    "            jres={}\n",
    "            klen=len(keyname)\n",
    "            for i in range(klen): # 按照keyname列表，转化为json\n",
    "                item={keyname[i]:header[keyname[i]]}\n",
    "                jres.update(item)\n",
    "            es.index(index=\"em-index1\", id=e_id, document=jres) #插入索引当中，索引名为em-index1\n",
    "            es.indices.refresh(index=\"em-index1\")\n",
    "            return jres\n",
    "    else:\n",
    "        print(\"no file!\")\n",
    "\n",
    "\n",
    "es.indices.delete('index1')\n",
    "jres1=em2js(\"maildir/allen-p/deleted_items/1\",1)\n",
    "jres2=em2js(\"maildir/allen-p/_sent_mail/4\",2)\n",
    "jres3=em2js(\"maildir/arora-h/all_documents/1\",3)\n",
    "jres4=em2js(\"maildir/badeer-r/all_documents/3\",4)\n",
    "#print(\"X-Cc:%s\"%header['X-Cc'])\n",
    "\n",
    "\n",
    "\n",
    "query = {\n",
    "    \"query\": {\n",
    "        \"match_phrase\":{\n",
    "            \"From\" : \"phillip.allen@enron.com\"\n",
    "        }#如果用match，则会把所有邮件都返回，因为es将点号视为分隔，所有含com的邮件都会返回\n",
    "    }#如果用term，则会无法匹配任何邮件\n",
    "}\n",
    "params = {\"ignore_unavailable\": \"true\"}\n",
    "\n",
    "res1 = es.search(index=\"index1\", body=query,params=params)\n",
    "print(res1)\n",
    "print('\\n')\n",
    "for hit in res1['hits']['hits']:\n",
    "    print(\"From: %(From)s  To:%(To)s ,%(X-cc)s\" % hit[\"_source\"],\"id:%s\"%hit[\"_id\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47d35e7b",
   "metadata": {},
   "source": [
    "# 2. 核心代码：索引设计与构建\n",
    "&emsp;这一部分的代码对上一小节实现的转化函数em2js进行了一定修改，实现了get_pa函数对整个maildir目录进行遍历，构建全部邮件的索引。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "814179e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(517401, [])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from email.parser import Parser\n",
    "from elasticsearch.client import Elasticsearch\n",
    "from elasticsearch import helpers\n",
    "from datetime import datetime\n",
    "\n",
    "es = Elasticsearch([{'host':'127.0.0.1','port':9200}], timeout=3600)\n",
    "#es.indices.delete('em-index1') 删除索引\n",
    "\n",
    "def em2js(path,id_num): #本函数将Email转换为可以被es接受的json格式\n",
    "    keyname=['Message-ID','Date','From','To','Subject','Mime-Version','X-From','X-To','X-Folder','X-Origin','X-FileName',\"path\"]\n",
    "    # keyname：邮件中的一些关键信息的名称\n",
    "    if os.path.exists(path):\n",
    "        with open(path,errors='ignore') as fp: #忽略乱码的垃圾邮件\n",
    "            em=fp.read()\n",
    "            header=Parser().parsestr(em)\n",
    "            jres={} #用来存储转换结果\n",
    "            klen=len(keyname)\n",
    "            for i in range(klen-1):\n",
    "                item={keyname[i]:header[keyname[i]]}\n",
    "                jres.update(item)\n",
    "            item0={\"path\":path} # 把邮件的存储地址也存下来\n",
    "            jres.update(item0)\n",
    "            #es.index(index=\"em-index1\", id=id_num, document=jres)\n",
    "            #es.indices.refresh(index=\"em-index1\")\n",
    "            return jres\n",
    "    else:\n",
    "        print(\"no file!\")\n",
    "        \n",
    "\n",
    "def get_pa(path,alldoc,nu,doc): \n",
    "    \"\"\"\n",
    "    这一函数实现对path路径下所有合法文件的遍历读取\n",
    "    返回所有文件路径的列表alldoc\n",
    "    \n",
    "    path：待读目录所在路径\n",
    "    nu：读取到的第nu个文件\n",
    "    doc：邮件转化成的json文件集合\n",
    "    \"\"\"\n",
    "    tot = os.listdir(path) #文件列表\n",
    "    for pi in tot:\n",
    "        tpath=os.path.join(path+\"/\",pi)\n",
    "        if os.path.isfile(tpath):\n",
    "            alldoc.append(tpath)#每找到一个文件，就把它的路径存下\n",
    "            doc.append(em2js(tpath,nu)) #存下待建立索引的json文本\n",
    "            nu+=1\n",
    "        else:\n",
    "            get_pa(tpath,alldoc,nu,doc)#遇到文件夹则继续递归\n",
    "    return alldoc\n",
    "\n",
    "pa=os.path.join(\"maildir\")\n",
    "id_num=1\n",
    "doc0=[]\n",
    "a=get_pa(pa,[],id_num,doc0) # a为所有文件路径\n",
    "actions=[] \n",
    "#批量导入\n",
    "for i in range(len(doc0)):\n",
    "    action={'_op_type':'index',#操作参数可选：index插入文件 update更新索引 create建立索引 delete删除内容  \n",
    "            '_index':'em-index1',#索引名称\n",
    "            '_type':'doc',  #类型\n",
    "            '_source':doc0[i]}\n",
    "    actions.append(action)\n",
    "helpers.bulk(client=es,actions=actions,index='em-index1')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1fc2b12",
   "metadata": {},
   "source": [
    "对建立好的索引进行一个简单的测试："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "147df7f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/all_documents/1730    id: IW05HX0BgZQWTQ3PXK47\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/all_documents/1761    id: Qm05HX0BgZQWTQ3PXK47\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/discussion_threads/1784    id: W205HX0BgZQWTQ3PXbam\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/discussion_threads/1794    id: Zm05HX0BgZQWTQ3PXbam\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/sent/1050    id: 6m05HX0BgZQWTQ3PX71v\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/sent/1081    id: DG05HX0BgZQWTQ3PX75v\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/_sent_mail/1148    id: q205HX0BgZQWTQ3PYMT2\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/_sent_mail/1179    id: y205HX0BgZQWTQ3PYMT2\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/_sent_mail/37    id: AG05HX0BgZQWTQ3PYcdM\n",
      "From: eric.bass@enron.com  To:chance.rabon@enron.com ,maildir/bass-e/_sent_mail/6    id: 9G05HX0BgZQWTQ3PYcdM\n",
      "\n",
      "\n",
      "Got 15 Hits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-37-1502a9859601>:18: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
      "  res2 = es.search(index=\"em-index1\", body=query2,params=params)\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from email.parser import Parser\n",
    "from elasticsearch.client import Elasticsearch\n",
    "from elasticsearch import helpers\n",
    "from datetime import datetime\n",
    "\n",
    "es = Elasticsearch([{'host':'127.0.0.1','port':9200}], timeout=3600)\n",
    "\n",
    "query2 = {\"query\": \n",
    "          {\"bool\":\n",
    "           {\"must\":\n",
    "            [ {\"match_phrase\":{\"From\" : \"eric.bass@enron.com\"}},\n",
    "             {\"match_phrase\":{\"To\" : \"chance.rabon@enron.com\"}},{\"match_phrase\":{\"Subject\" : \"Rebooks\"}}] }}\n",
    "          ,\"size\":  10 \n",
    "         }\n",
    "\n",
    "params = {\"ignore_unavailable\": \"true\"}\n",
    "res2 = es.search(index=\"em-index1\", body=query2,params=params)\n",
    "#print(res2)\n",
    "for hit in res2['hits']['hits']:\n",
    "    print(\"From: %(From)s  To:%(To)s ,%(path)s\" % hit[\"_source\"],\"   id: %s\"%hit[\"_id\"])\n",
    "print('\\n')\n",
    "print(\"Got %d Hits\" % res2['hits']['total']['value'])\n",
    "# es默认显示10条数据，在“size”处修改内容\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e45e52a",
   "metadata": {},
   "source": [
    "# 3.检索功能设计\n",
    "&emsp;这部分代码实现了邮件检索的图形用户界面，可以按照收件人、发件人、标题等内容进行邮件检索。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "29e172d7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-39-20cbf3c9c197>:70: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
      "  res2 = es.search(index=\"em-index1\", body=query,params=params)\n"
     ]
    }
   ],
   "source": [
    "import tkinter as tk\n",
    "import tkinter.messagebox\n",
    " \n",
    "m_dialog = tk.Tk()\n",
    "m_dialog.minsize(350,100)# 初始大小为350×100\n",
    "text1 = tk.Text(m_dialog,width=80) # 查询结果显示框\n",
    "\n",
    "# 发件邮箱输入\n",
    "fmFrom = tk.Frame(m_dialog)\n",
    "fmFrom.pack(side=\"top\")\n",
    "txtFrom = tk.Label(fmFrom, text=\"发件邮箱 : \")\n",
    "txtFrom.pack(side=\"left\")\n",
    "entryFrom = tk.Entry(fmFrom, bd=3,width=30)\n",
    "entryFrom.pack(side=\"right\")\n",
    "\n",
    "# 收件邮箱输入\n",
    "fmTo = tk.Frame(m_dialog)\n",
    "fmTo.pack()\n",
    "txtTo = tk.Label(fmTo, text=\"收件邮箱 : \")\n",
    "txtTo.pack(side=\"left\")\n",
    "entryTo = tk.Entry(fmTo, bd=3,width=30)\n",
    "entryTo.pack(side=\"right\")\n",
    "\n",
    "# 邮件主题输入\n",
    "fmSbj = tk.Frame(m_dialog)\n",
    "fmSbj.pack()\n",
    "txtSbj = tk.Label(fmSbj, text=\"邮件标题 : \")\n",
    "txtSbj.pack(side=\"left\")\n",
    "entrySbj = tk.Entry(fmSbj, bd=3,width=30)\n",
    "entrySbj.pack(side=\"right\")\n",
    "\n",
    "# 邮件ID输入\n",
    "fmID = tk.Frame(m_dialog)\n",
    "fmID.pack(side=\"top\")\n",
    "txtID = tk.Label(fmID, text=\"邮件ID   : \")\n",
    "txtID.pack(side=\"left\")\n",
    "entryID = tk.Entry(fmID, bd=3,width=30)\n",
    "entryID.pack(side=\"right\")\n",
    "\"\"\"\n",
    "\n",
    "eric.bass@enron.com\n",
    "\n",
    "chance.rabon@enron.com\n",
    "\n",
    "Rebooks\n",
    "\n",
    "\"\"\"\n",
    "def searchm():\n",
    "    text1.delete(0.0,\"end\")\n",
    "    if entryFrom.get() == \"\" and entryTo.get() == \"\" and entrySbj.get() == \"\" and entryID.get()==\"\":\n",
    "        print(tkinter.messagebox.showinfo(\"Result\", \"No message.\"))  # 未输入查询信息时进行提醒\n",
    "    else:\n",
    "        fro0 = entryFrom.get() # 发件人邮箱\n",
    "        to0 = entryTo.get() # 收件人邮箱\n",
    "        sbj0 = entrySbj.get() # 邮件主题\n",
    "        id0=entryID.get() #邮件的Message-ID\n",
    "        \n",
    "        # 若对应查询栏为空，则忽略这一信息的约束，只对其他内容进行查询\n",
    "        query = {\"query\": {\"match_phrase\":{\"From\" : fro0}},\"size\":  100}\n",
    "        \n",
    "        if (fro0==\"\"and to0!=\"\"and sbj0==\"\"):  # 仅指定了收件人\n",
    "            query = {\"query\": {\"match_phrase\":{\"To\" : to0}},\"size\":  100} \n",
    "        if (fro0==\"\"and to0==\"\"and sbj0!=\"\"):  # 仅指定了邮件主题\n",
    "            query = {\"query\": {\"match_phrase\":{\"Subject\" : sbj0}},\"size\":  100} \n",
    "        if (fro0!=\"\"and to0!=\"\"and sbj0==\"\"):  # 仅指定了收发人\n",
    "            query = {\"query\": {\"bool\":{\"must\":[ {\"match_phrase\":{\"From\" : fro0}},{\"match_phrase\":{\"To\" : to0}} ] }},\"size\":  100}\n",
    "        if (fro0!=\"\"and to0==\"\"and sbj0!=\"\"):  # 仅指定了发件人和主题\n",
    "            query = {\"query\": {\"bool\":{\"must\":[ {\"match_phrase\":{\"From\" : fro0}},{\"match_phrase\":{\"Subject\" : sbj0}} ] }},\"size\":  100}\n",
    "        if (fro0==\"\"and to0!=\"\"and sbj0!=\"\"):  \n",
    "            query = {\"query\": {\"bool\":{\"must\":[ {\"match_phrase\":{\"Subject\" : sbj0}},{\"match_phrase\":{\"To\" : to0}} ] }},\"size\":  100}\n",
    "        if (fro0!=\"\"and to0!=\"\"and sbj0!=\"\"):\n",
    "            query = {\"query\": {\"bool\":{\"must\":[ {\"match_phrase\":{\"From\" : fro0}},{\"match_phrase\":{\"To\" : to0}},{\"match_phrase\":{\"Subject\" : sbj0}} ] }},\"size\":  100}\n",
    "        if(id0!=\"\"): # 只要指定了ID，就不再需要其他信息，ID可以唯一确定一封邮件\n",
    "            query = {\"query\": {\"match_phrase\":{\"Message-ID\" : id0}},\"size\":  100}\n",
    "        \n",
    "        params = {\"ignore_unavailable\": \"true\"}\n",
    "        res2 = es.search(index=\"em-index1\", body=query,params=params)\n",
    "        num = 0\n",
    "        for hit in res2['hits']['hits']: #显示查询结果\n",
    "            num += 1\n",
    "            text1.insert(\"insert\", (\"Some messages about mail-%d\"%num+\":\\n  ID: %(Message-ID)s\\n  From: %(From)s   To:%(To)s \\n  path_in_folder: %(path)s\"%hit[\"_source\"]))\n",
    "            text1.insert(\"insert\",\"\\n\\n\")\n",
    "        text1.insert(\"end\", \"\")\n",
    "        text1.pack()\n",
    "\n",
    "\n",
    "button = tk.Frame(m_dialog)\n",
    "button.pack(side=\"bottom\") # 添加一个search按钮\n",
    "searchb = tk.Button(\n",
    "    button,\n",
    "    text=\"search\",\n",
    "    activeforeground=\"white\",\n",
    "    activebackground=\"black\",\n",
    "    width=\"15\",\n",
    "    command=searchm\n",
    ")\n",
    "searchb.pack(side=\"left\")\n",
    "\n",
    "m_dialog.mainloop()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a497dab",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
